*********************************************
*	Author: Rithika Kumar	                *
*   GOAL: Setting up data for Appendix F1   *        
*   Table A12                               * 
*********************************************

********************************************************************************************************
* NOTE: Compiling data from the IHDS individual and household datasets for the salary analysis. 
* This data is not in the EW dataset and needs to be compiled by merging multiple datasets 
* from the IHDS. 
* I use the EW file main_ew_hh_df.dta as the base and match it to the household and individual
* IDs in the other files. 
********************************************************************************************************

* Set working directory to "JOP Replication files" folder on your computer 

* ---------------------------------------------------------------------------
* STEP 1: household-level income file -> TEMP_FILES/farm_work_tomerge.dta
*
* Loads the IHDS household file (per the author's note, this is the household
* data already merged across waves by IHDS), keeps the ID / location / income /
* weight variables, tags each row with its survey year and builds an
* 11-character household ID (hhuid_noyr) that carries no year component.
* ---------------------------------------------------------------------------

use "DATA FILES TO SHARE/IHDS_RAW/37382-0011-Data.dta", clear

* Restrict to the variables needed downstream
keep SURVEY HHBASE HHFAM2 STATEID DISTID PSUID HHID HHSPLITID ///
     HHID2005 HHID2012 HHSPLITID2005 HHSPLITID2012 HHWAVES STDIST01 PSUWAVES ///
     URBAN URBAN2001 URBAN4 URBAN4_2001 URBAN2011 URBAN4_2011 METRO METRO6 ///
     INCCROP INCAGPROP INCANIMAL INCAG WT2005 FWT2012 FWT2005 FM1

* Survey year: SURVEY == 1 -> 2005, SURVEY == 2 -> 2012, anything else -> 0
generate year_round = cond(SURVEY == 1, 2005, cond(SURVEY == 2, 2012, 0))

* Zero-padded display formats for the ID components.
* (The string() calls below pass their own formats explicitly; these format
*  statements only change how the variables are displayed/stored.)
format DISTID PSUID HHSPLITID %02.0f
format HHID %03.0f

* Author's notes on split households:
*   - When a household splits in the second wave, the wave-1 values of the
*     unsplit household are repeated for each split household's wave-1 row.
*   - HHSPLITID is the split ID of the household, standardized across the two
*     waves.
*
* Household ID without year: state(2) + district(2) + PSU(2) + HHID(3) + split(2)
generate str11 hhuid_noyr = string(STATEID,"%02.0f")   ///
                          + string(DISTID,"%02.0f")    ///
                          + string(PSUID,"%02.0f")     ///
                          + string(HHID,"%03.0f")      ///
                          + string(HHSPLITID,"%02.0f")

* Sanity check on the ID: number the distinct IDs and look at the maximum.
* With one row per household per wave we expect two rows per distinct ID.
egen x = group(hhuid_noyr)
summarize x

/*
   Output recorded by the author:

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
           x |     80,036     20009.5    11552.27          1      40018

   i.e. 80,036 rows and 40,018 distinct households.
*/

save "DATA FILES TO SHARE/TEMP_FILES/farm_work_tomerge.dta", replace 

* ---------------------------------------------------------------------------
* STEP 2: eligible-women (EW) file -> TEMP_FILES/farm_work_EW.dta
*
* Takes the 2012 rows of the IHDS file 37382-0017, attaches the household
* income rows saved in farm_work_tomerge.dta via hhuid_noyr, and then builds a
* second key (hhuid_2005) from the 2005 household identifiers plus the year.
* ---------------------------------------------------------------------------

clear
use "DATA FILES TO SHARE/IHDS_RAW/37382-0017-Data.dta", clear

* Survey year: SURVEY == 1 -> 2005, SURVEY == 2 -> 2012, anything else -> 0
generate year_round = cond(SURVEY == 1, 2005, cond(SURVEY == 2, 2012, 0))

tabulate year_round

* Author's note: there is essentially one woman per household ID in this file,
* and here the split ID is not unique per individual (it can differ between
* the 2005 and 2012 values), whereas HHSPLITID in the household file holds the
* 2012 split ID. So only the 2012 rows are kept and the ID is built from
* HHID2012 / HHSPLITID2012.
keep if year_round != 2005

* Zero-padded display formats for the ID components
format DISTID PSUID HHSPLITID2012 %02.0f
format HHID2012 %03.0f

* Household ID used to link to the household income file from step 1:
* state(2) + district(2) + PSU(2) + HHID2012(3) + HHSPLITID2012(2)
generate str11 hhuid_noyr = string(STATEID,"%02.0f")       ///
                          + string(DISTID,"%02.0f")        ///
                          + string(PSUID,"%02.0f")         ///
                          + string(HHID2012,"%03.0f")      ///
                          + string(HHSPLITID2012,"%02.0f")

* Count distinct IDs (visual check only; x is dropped by the keep below)
egen x = group(hhuid_noyr)
summarize x

* Keep only the EW identifiers / eligibility variables and the merge key.
* Everything else (location IDs, year_round, ...) comes back from the
* household file in the merge.
keep IDPSU IDPERSON hhuid_noyr EW5 EWELIGIBLE ID11 GROUPS6 GROUPS EWQELIGIBLE

* One EW row per household ID on this side, several rows per ID (one per
* wave) in the household file.
merge 1:m hhuid_noyr using "DATA FILES TO SHARE/TEMP_FILES/farm_work_tomerge.dta"

* Author's note: 50,958 rows matched - essentially every EW row found its
* household. Unmatched rows remain because a household can have several
* splits while only one woman per household is interviewed, so a split
* household may have income data but no EW interview.
drop if _merge != 3
tabulate year_round

* Second key, based on the 2005 household identifiers, because (per the
* author) the main EW data set is keyed on the 2005 IDs.
format HHID2005 HHSPLITID2005 %02.0f

* state(2) + district(2) + PSU(2) + HHID2005(2) + HHSPLITID2005(2) + year(4)
generate str14 hhuid_2005 = string(STATEID,"%02.0f")        ///
                          + string(DISTID,"%02.0f")         ///
                          + string(PSUID,"%02.0f")          ///
                          + string(HHID2005,"%02.0f")       ///
                          + string(HHSPLITID2005,"%02.0f")  ///
                          + string(year_round,"%04.0f")

* Count distinct household-year keys (visual check)
egen y = group(hhuid_2005)
summarize y

drop _merge
save "DATA FILES TO SHARE/TEMP_FILES/farm_work_EW.dta", replace 

* ---------------------------------------------------------------------------
* STEP 3: main EW household file + income data -> farm_work_EW.dta
*
* Rebuilds the same hhuid_2005 key on main_ew_hh_df, merges 1:1 with the file
* saved in step 2, keeps matched rows, saves the full matched file and then a
* trimmed analysis file.
* ---------------------------------------------------------------------------

use "DATA FILES TO SHARE/main_ew_hh_df", clear

* Survey year from the 0/1 wave indicator: year == 0 -> 2005, year == 1 -> 2012,
* anything else -> 0
generate year_round = cond(year == 0, 2005, cond(year == 1, 2012, 0))

* Zero-padded display formats for the ID components
format STATEID DISTID PSUID HHID2005 HHSPLITID2005 %02.0f

* Same construction as in the income file so the keys line up:
* state(2) + district(2) + PSU(2) + HHID2005(2) + HHSPLITID2005(2) + year(4)
generate str14 hhuid_2005 = string(STATEID,"%02.0f")        ///
                          + string(DISTID,"%02.0f")         ///
                          + string(PSUID,"%02.0f")          ///
                          + string(HHID2005,"%02.0f")       ///
                          + string(HHSPLITID2005,"%02.0f")  ///
                          + string(year_round,"%04.0f")

* Count distinct household-year keys (visual check)
egen y = group(hhuid_2005)
summarize y

* 1:1 on the household-year key; Stata stops with an error if the key is not
* unique on either side.
merge 1:1 hhuid_2005 using "DATA FILES TO SHARE/TEMP_FILES/farm_work_EW.dta"

* Author's note: about 49,730 rows match; the rest do not.
drop if _merge != 3

* Full matched file
save "DATA FILES TO SHARE/TEMP_FILES/farm_work_EW_fordid.dta", replace 

* Trimmed file with only the variables kept for the analysis
keep year INCAG w2_abshusband_dummy URBAN hhuid_num log_inc DISTID ///
     did_sample vill_id_num

save "DATA FILES TO SHARE/farm_work_EW.dta", replace







